import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import torch
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
imports
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant graph with one edge per (card, merchant) pair.

    Transactions between the same card and merchant are collapsed into a
    single edge whose ``weight`` is the summed ``amt`` and whose ``label`` is
    1 if any of those transactions has ``is_fraud`` > 0, else 0.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. It is copied, not modified.
        graph_type: Graph class or instance forwarded to networkx as
            ``create_using``. ``None`` (the default) builds a fresh
            ``nx.Graph`` on every call.

    Returns:
        The populated networkx graph.
    """
    # A default of ``nx.Graph()`` in the signature would be created once and
    # then cleared and reused by networkx on every call, so all callers would
    # end up sharing (and overwriting) one graph object.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # One integer node id per distinct card number or merchant name.
    distinct_values = set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
    mapping = {value: node_id for node_id, value in enumerate(distinct_values)}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])  # edge source
    df["to"] = df["merchant"].apply(lambda x: mapping[x])  # edge target

    # Collapse repeated (card, merchant) pairs into a single row.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    df["is_fraud"] = (df["is_fraud"] > 0).astype(int)

    edges = list(zip(df["from"].tolist(), df["to"].tolist()))
    G = nx.from_edgelist(edges, create_using=graph_type)

    # Edge attributes: fraud flag as "label", summed amount as "weight".
    nx.set_edge_attributes(G, dict(zip(edges, df["is_fraud"].tolist())), "label")
    nx.set_edge_attributes(G, dict(zip(edges, df["amt"].tolist())), "weight")
    return G
def build_graph_tripartite(df_input, graph_type=None):
    """Build a graph with card, merchant and transaction nodes.

    Every row of the frame becomes its own transaction node, linked by one
    edge to its card node and one edge to its merchant node. Both edges carry
    the row's ``is_fraud`` as ``label`` and its ``amt`` as ``weight``.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns; its index labels identify the transactions.
            It is copied, not modified.
        graph_type: Graph class or instance forwarded to networkx as
            ``create_using``. ``None`` (the default) builds a fresh
            ``nx.Graph`` on every call.

    Returns:
        The populated networkx graph.
    """
    # A default of ``nx.Graph()`` in the signature would be created once and
    # then cleared and reused by networkx on every call.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # Index labels, card numbers and merchant names share one id space.
    # NOTE(review): an index label equal to a cc_num value would map to the
    # same node id - confirm this cannot happen for the frames passed in.
    distinct_values = set(
        df.index.values.tolist()
        + df["cc_num"].values.tolist()
        + df["merchant"].values.tolist()
    )
    mapping = {value: node_id for node_id, value in enumerate(distinct_values)}

    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    txn_nodes = [mapping[idx] for idx in df.index]
    in_edges = list(zip(df["in_node"].tolist(), txn_nodes))    # card -> transaction
    out_edges = list(zip(df["out_node"].tolist(), txn_nodes))  # merchant -> transaction

    G = nx.from_edgelist(in_edges + out_edges, create_using=graph_type)

    labels = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()
    for edges in (in_edges, out_edges):
        nx.set_edge_attributes(G, dict(zip(edges, labels)), "label")
        nx.set_edge_attributes(G, dict(zip(edges, amounts)), "weight")
    return G
def down_sample_textbook(df):
    """Return a class-balanced frame by shrinking the non-fraud rows.

    All rows with ``is_fraud == 1`` are kept; the rows with ``is_fraud == 0``
    are sampled without replacement (seed 42) down to the same count. The
    fraud rows come first in the returned frame.
    """
    fraud_rows = df.loc[df["is_fraud"] == 1].copy()
    legit_rows = df.loc[df["is_fraud"] == 0].copy()
    legit_sample = sklearn.utils.resample(
        legit_rows,
        replace=False,
        n_samples=len(fraud_rows),
        random_state=42,
    )
    return pd.concat([fraud_rows, legit_sample])
def embedding(Graph):
    """Turn a labelled graph into edge features ``X`` and edge labels ``y``.

    Node vectors are learned with Node2Vec (using the ``weight`` edge
    attribute) on a copy of the graph, then combined per edge with
    ``AverageEmbedder``.

    Returns:
        ``(X, y)`` where ``X`` is a list with one embedding per edge of
        ``Graph`` and ``y`` is an array of the edges' ``label`` attributes.
    """
    edge_list = list(Graph.edges)

    # Graph -> X (feature): train on an independent copy that holds every
    # edge, re-adding any node that has no edge.
    work_graph = Graph.edge_subgraph(edge_list).copy()
    missing_nodes = set(Graph.nodes) - set(work_graph.nodes)
    work_graph.add_nodes_from(list(missing_nodes))

    node_vectors = Node2Vec(work_graph, weight_key='weight').fit(window=10).wv
    edge_embedder = AverageEmbedder(node_vectors)
    # The embedder is keyed by the string form of the two endpoint ids.
    X = [edge_embedder[str(edge[0]), str(edge[1])] for edge in edge_list]

    # Graph -> y (label)
    edge_labels = nx.get_edge_attributes(Graph, "label")
    y = np.array(list(edge_labels.values()))
    return X, y
def anal(df):
    """Train a random forest on edge embeddings and score it on held-out edges.

    The transactions are turned into a bipartite graph, every edge is
    embedded, and the edges are split into a training and a test part
    (seed 42). A 100-tree random forest is fitted on the training edges.

    Args:
        df: Transaction DataFrame accepted by ``build_graph_bipartite``.

    Returns:
        A one-row DataFrame with columns ``acc``, ``pre``, ``rec`` and ``f1``
        computed on the test edges.
    """
    Graph = build_graph_bipartite(df)

    # embedding() returns features and labels for ALL edges, so the
    # train/test split has to be made here.
    features, labels = embedding(Graph)
    X, XX, y, yy = model_selection.train_test_split(features, labels, random_state=42)

    # RandomForestClassifier is reached through the imported `ensemble`
    # module; the bare class name is not in scope in this file.
    lrnr = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)

    return pd.DataFrame({
        'acc': [sklearn.metrics.accuracy_score(yy, yyhat)],
        'pre': [sklearn.metrics.precision_score(yy, yyhat)],
        'rec': [sklearn.metrics.recall_score(yy, yyhat)],
        'f1': [sklearn.metrics.f1_score(yy, yyhat)],
    })
def our_sampling1(df):
    """Keep every transaction of each card that has at least one fraud row.

    Args:
        df: DataFrame with ``cc_num`` and ``is_fraud`` columns.

    Returns:
        The rows of ``df`` whose ``cc_num`` appears on some ``is_fraud == 1`` row.
    """
    # `cus_list` is referenced by name from inside the query string below.
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @ cus_list")


# Load the transactions, dropping the first CSV column.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
# Parse the timestamps in one vectorised call instead of once per row.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
| | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
| 1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
| 2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
| 3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
| 4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
| 1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
| 1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
| 1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
| 1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
시도
# Keep a 20% sample of the non-fraud rows together with every fraud row.
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape  # recorded output: (214520, 22)

# Balance the two classes.
df50 = down_sample_textbook(df02)
df50.shape  # recorded output: (12012, 22)

# Number of ordered row pairs in df50.
12012*12012  # recorded output: 144288144
고려할 것(230810)
df50 의 shape이 12000개 이므로 9000개의 T, 3000개의 F를 train mask로 만들자.
고객정보가 동일하면 edge를 1로, 아니면 0으로 놓고 1에대한 weight를 만들자.
g(V,E,W)에서의 weight
df50 = df50.reset_index()
N = len(df50)

# train/test split
df50_tr, df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)  # recorded output: (0.49828, 0.50516)
df50_tr.shape, df50_test.shape  # recorded output: ((9009, 23), (3003, 23))

# Mask sizes follow the actual split instead of hard-coded 9009 / 3003.
# The masks mark the first len(df50_tr) positions as train and the rest as
# test, so they only fit a frame that stacks df50_tr before df50_test.
n_train, n_test = len(df50_tr), len(df50_test)
train_mask = np.concatenate((np.full(n_train, True), np.full(n_test, False)))
test_mask = np.concatenate((np.full(n_train, False), np.full(n_test, True)))
print("Train Mask:", train_mask)
print("Test Mask:", test_mask)
# recorded output:
# Train Mask: [ True True True ... False False False]
# Test Mask: [False False False ... True True True]
train_mask.shape, test_mask.shape  # recorded output: ((12012,), (12012,))
train_mask.sum(), test_mask.sum()  # recorded output: (9009, 3003)
df50_com = pd.concat([df50_tr, df50_test])
df50_com = df50_com.reset_index()
df50_com
| | level_0 | index | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 476 | 51331 | 2019-01-31 00:44:00 | 3.543590e+15 | fraud_Medhurst PLC | shopping_net | 921.24 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | c8928ba53be26fdd997b26f7130c757e | 1327970678 | 40.064488 | -78.210499 | 1 |
| 1 | 3671 | 625691 | 2019-09-23 00:09:00 | 2.610530e+15 | fraud_Torphy-Goyette | shopping_pos | 698.28 | Tanya | Dickerson | F | ... | 36.2416 | -86.6117 | 22191 | Prison officer | 1994-07-27 | 90453290b765904ed1c3426882a6788b | 1348358993 | 35.884288 | -87.513318 | 1 |
| 2 | 6641 | 896244 | 2019-12-25 21:30:00 | 6.011330e+15 | fraud_Monahan-Morar | personal_care | 220.56 | Lauren | Butler | F | ... | 36.0557 | -96.0602 | 413574 | Teacher, special educational needs | 1971-09-01 | 4072a3effcf51cf7cf88f69d00642cd9 | 1356471044 | 35.789798 | -95.859736 | 0 |
| 3 | 4288 | 717690 | 2019-11-02 22:22:00 | 6.011380e+15 | fraud_Daugherty, Pouros and Beahan | shopping_pos | 905.43 | Martin | Duarte | M | ... | 44.6001 | -84.2931 | 864 | General practice doctor | 1942-05-04 | f2fa1b25eef2f43fa5c09e3e1bfe7f77 | 1351894926 | 44.652759 | -84.500359 | 1 |
| 4 | 4770 | 815813 | 2019-12-08 02:50:00 | 4.430880e+15 | fraud_Hudson-Ratke | grocery_pos | 307.98 | Alicia | Morales | F | ... | 39.3199 | -106.6596 | 61 | Public relations account executive | 1939-11-04 | f06eff8da349e36e623cff026de8e970 | 1354935056 | 38.389399 | -106.111026 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12007 | 56 | 12246 | 2019-01-08 01:50:00 | 4.859530e+15 | fraud_Howe Ltd | misc_pos | 664.73 | Rebecca | Farley | F | ... | 42.0716 | -75.0152 | 1228 | Psychotherapist, child | 1990-02-25 | 61817f427fdb2a54d7c51595026021d2 | 1325987429 | 41.124211 | -75.954718 | 1 |
| 12008 | 3194 | 538327 | 2019-08-18 23:37:00 | 5.020130e+11 | fraud_Miller-Harris | misc_net | 844.60 | Sherry | Martinez | F | ... | 42.6315 | -75.1866 | 165 | Naval architect | 1945-09-20 | 635ba4a5f582514e053e96bf3a4376ac | 1345333050 | 42.207966 | -74.695138 | 1 |
| 12009 | 2855 | 459431 | 2019-07-22 03:18:00 | 4.917190e+15 | fraud_Corwin-Collins | gas_transport | 17.97 | Joel | Rivera | M | ... | 35.8759 | -96.9623 | 1165 | Psychotherapist, child | 1944-11-11 | f9526787905f648773a69e1f97faa017 | 1342927100 | 34.880538 | -96.384044 | 1 |
| 12010 | 10690 | 720826 | 2019-11-03 21:51:00 | 4.997730e+15 | fraud_Hagenes, Hermann and Stroman | travel | 7.58 | Stephanie | Taylor | F | ... | 44.9913 | -92.9487 | 753116 | Fisheries officer | 1971-08-06 | 3cd0cc36fa115887dba94c1d5b3fb2df | 1351979471 | 44.177391 | -92.998310 | 0 |
| 12011 | 2986 | 489045 | 2019-08-02 01:42:00 | 2.450830e+15 | fraud_Herman, Treutel and Dickens | misc_net | 824.99 | Timothy | Kirby | M | ... | 45.6040 | -94.1591 | 16163 | Hydrographic surveyor | 1987-02-22 | 5832beb3af071da9ddd41d9ff8f7a5a1 | 1343871750 | 44.785690 | -93.624590 | 1 |
12012 rows × 24 columns
이건 weight?
# edge_index_list = []
# for i in range(N):
# for j in range(N):
# time_difference = (df50['trans_date_trans_time'][i] - df50['trans_date_trans_time'][j]).total_seconds()
# edge_index_list.append([i, j, time_difference])# edge_index_list[:5][[0, 0, 0.0],
[0, 1, -2460.0],
[0, 2, -7140.0],
[0, 3, -9120.0],
[0, 4, -10140.0]]
# np.save('edge_index_list_50.npy', edge_index_list)
# loaded_data = np.load('edge_index_list_50.npy')# edge_index = np.array(edge_index_list)
# edge_index[:,2] = np.abs(edge_index[:,2])
# theta = edge_index[:,2].mean()
# theta12238996.895508753
# edge_index[:,2] = (np.exp(-edge_index[:,2]/theta)!=1) * np.exp(-edge_index[:,2]/theta)
# edge_indexarray([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[0.00000000e+00, 1.00000000e+00, 9.99799023e-01],
[0.00000000e+00, 2.00000000e+00, 9.99416789e-01],
...,
[1.20110000e+04, 1.20090000e+04, 4.19756312e-01],
[1.20110000e+04, 1.20100000e+04, 2.26811434e-01],
[1.20110000e+04, 1.20110000e+04, 0.00000000e+00]])
# edge_index[:,2]array([0. , 0.99979902, 0.99941679, ..., 0.41975631, 0.22681143,
0. ])
Q. 그런데 밑에서 random으로 train과 test로 나누게 되면.. weight랑 edge를 어떻게 적용시키지?
edge: 같은 cc_num이면 edge=1, 다르면 edge=0
# Edge rule: rows i and j are connected (edge = 1) when they share a cc_num,
# otherwise edge = 0. A single broadcast comparison replaces the N*N Python
# loop with per-element pandas lookups.
cc_values = df50_com['cc_num'].to_numpy()[:N]
same_card = cc_values[:, None] == cc_values[None, :]  # (N, N) boolean matrix

# Full [i, j, edge] table in row-major (i, then j) order, as written to disk.
src, dst = np.indices(same_card.shape)
edge_index_list2_com = np.column_stack(
    (src.ravel(), dst.ravel(), same_card.ravel().astype(np.int64))
)
np.save('edge_index_list2_50_com.npy', edge_index_list2_com)
loaded_data = np.load('edge_index_list2_50_com.npy')
edge_index_list2_com[:5]  # recorded output: [[0, 0, 1], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0]]

# Keep only the connected pairs; np.nonzero returns them in row-major order.
# Pairs with i == j are included.
rows, cols = np.nonzero(same_card)
edge_one_com = list(zip(rows.tolist(), cols.tolist()))
edge_one_com[:5]  # recorded output: [(0, 0), (0, 344), (0, 1377), (0, 1447), (0, 1639)]
len(edge_one_com)  # recorded output: 200706

# Shape (2, num_edges): first row holds sources, second row targets.
edge_one_index_com = torch.tensor(edge_one_com, dtype=torch.long).t()
edge_one_index_com.shape  # recorded output: torch.Size([2, 200706])
data설정(x, edge_index, y)
# Node features: the transaction amount as a single float column.
x = df50_com['amt']
a = torch.tensor(x.to_numpy(), dtype=torch.float)
a = a.reshape(-1,1)
a
# recorded output:
# tensor([[921.2400],
#         [698.2800],
#         [220.5600],
#         ...,
#         [ 17.9700],
#         [  7.5800],
#         [824.9900]])

# Node labels: the fraud flag.
y = df50_com['is_fraud']
b = torch.tensor(y.to_numpy(), dtype=torch.int64)
b  # recorded output: tensor([1, 1, 0, ..., 1, 0, 1])

import torch_geometric

# The masks are handed over as torch bool tensors rather than NumPy arrays so
# every field of `data` is a tensor.
data = torch_geometric.data.Data(
    x=a,
    edge_index=edge_one_index_com,
    y=b,
    train_mask=torch.as_tensor(train_mask, dtype=torch.bool),
    test_mask=torch.as_tensor(test_mask, dtype=torch.bool),
)
data  # recorded output: Data(x=[12012, 1], edge_index=[2, 200706], y=[12012], train_mask=[12012], test_mask=[12012])
gnn
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Number of input features per node (default 1).
        hidden_channels: Width of the hidden layer (default 16).
        num_classes: Number of output classes (default 2).
    """

    def __init__(self, in_channels=1, hidden_channels=16, num_classes=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities for ``data.x`` / ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)


model = GCN()
model
# recorded output:
# GCN(
#   (conv1): GCNConv(1, 16)
#   (conv2): GCNConv(16, 2)
# )
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
# recorded output:
# GCN(
#   (conv1): GCNConv(1, 16)
#   (conv2): GCNConv(16, 2)
# )

# Full-batch training: the loss is computed on the training nodes only.
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

# Evaluation: switch off dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
    pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')  # recorded output: Accuracy: 0.5012
왜 자꾸 accuracy가 안좋게 나오노
# Number of fraud rows among the test nodes.
data.y[data.test_mask].sum()  # recorded output: tensor(1517)

# Log-probabilities of the training nodes from the last training step.
out[data.train_mask]
# recorded output:
# tensor([[-1.0039, -0.4564],
#         [-1.6018, -0.2251],
#         [-0.3896, -1.1311],
#         ...,
#         [-1.7755, -0.1856],
#         [-0.9515, -0.4880],
#         [-2.3393, -0.1014]], grad_fn=<IndexBackward0>)

# True labels of the test nodes.
data.y[data.test_mask]  # recorded output: tensor([0, 1, 0, ..., 1, 0, 1])
음……………. edge_list를 다시 해보자.
!!!! 첫 번째 시도에서는 edge_list를 무작정 1인 걸 고른 것이 아니었는데 왜 여기서는 이러고 있었니……………..